DONE.
TODO.
import pandas as pd
import csv
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (14, 6)
%load_ext autoreload
%autoreload 2
fields = ["timestamp", "pickup_timestamp", "return_timestamp", "broker_contract_id"]
df = pd.read_csv("rate_quote.csv", skipinitialspace=True, usecols=fields)
print(df.shape)
df["timestamp"] = pd.to_datetime(df['timestamp'])
df["pickup_timestamp"] = pd.to_datetime(df['pickup_timestamp'])
df["return_timestamp"] = pd.to_datetime(df['return_timestamp'])
print(df.dtypes)
Map contracts to certain broker
fields = ["Contract_ID", "Contract_name", "Broker name"]
contract_df = pd.read_csv("broker_contracts.csv", skipinitialspace=True, usecols=fields)
contract_df["Broker name"] = contract_df["Broker name"].astype(str)
contract_df["Contract_name"] = contract_df["Contract_name"].astype(str)
# This holds contract names and which broker name corresponds to each contract id..
#contract_df = contract_df.set_index("Contract_ID")
print(contract_df.shape)
print(contract_df.dtypes)
# What different contractors are there?
contractors = set(contract_df["Broker name"])
print("Different contractors:", contractors)
# For broker, find which contract ids it has
from collections import defaultdict
broker_contract_ids = defaultdict(set)
for row in contract_df.itertuples():
broker_contract_ids[row[3]].add(row[0])
contract_df.columns = ["broker_contract_id", "Contract_name", "Broker_name"]
#print(df)
merged_requests = pd.merge(df, contract_df, on=["broker_contract_id"])
print(merged_requests.dtypes)
# Create new, merged table, with contractor name in each.
contract_df
t = 2e6
t = 1
print(df["timestamp"][t])
print(df["pickup_timestamp"][t])
print(df["return_timestamp"][t])
# requests per datetime
df_requests = pd.DataFrame(df["timestamp"])
requests_per_day = df_requests.groupby(df_requests["timestamp"].dt.date).count()
requests_per_day.plot.bar()
reqs_separately = merged_requests[["timestamp", "Broker_name"]]
requests_per_contract_ = reqs_separately.groupby(reqs_separately["Broker_name"])
requests_per_contractor = requests_per_contract_.count().plot(kind="bar", title="Request count by contractor")
requests_per_contractor.set_ylabel("Request count")
requests_per_contractor = reqs_separately.groupby(reqs_separately["Broker_name"]).count().plot(kind="bar", title="Request count by contractor in log scale")
requests_per_contractor.set_yscale("log", nonposy='clip')
requests_per_contractor.set_ylabel("Request count")
contractors = []
for group in requests_per_contract_:
# print(group[1])
time_contract = pd.DataFrame(group[1]["timestamp"])
contractors.append((group[0], time_contract.groupby(time_contract["timestamp"].dt.date).count()))
plt = time_contract.groupby(time_contract["timestamp"].dt.date).count().plot(
kind="bar", title="Requests per day for contractor " + str(group[0])
)
plt.set_ylabel("Request count")
plt.set_xlabel("Request timestamp")
contractors
l = []
for con in contractors:
for row in con[1].itertuples():
l.append([con[0], row[0], row[1]])
df_contractor_dreq = pd.DataFrame(l)
df_contractor_dreq.columns = ["contractor", "date", "count"]
print(df_contractor_dreq)
#import importlib
#importlib.reload(seaborn)
import seaborn as sns
import matplotlib.pyplot as plt
plt.ylim(10, 40)
sns.barplot(x='contractor', y='count', hue='date', data=df_contractor_dreq, log=True)
plt.ylim(1, 10e6)
TODO: Make a different plot on how many days ahead the request was made...
# group data by day
daylist = []
days = []
for group in df.groupby(df["timestamp"].dt.date):
days.append(group[0])
daylist.append(group[1])
for i, day in enumerate(daylist):
day_pickup = pd.DataFrame(day["pickup_timestamp"])
plt = day_pickup.groupby(day_pickup["pickup_timestamp"].dt.date).count().plot(
kind="line", title="Request pickup times for " + str(days[i]))
plt.set_ylabel("Count")
plt.set_xlabel("Requested pickup time")
requests = df_requests.reset_index().set_index("timestamp")
resampled_requests_plot = requests.resample("10Min").count().plot(kind="line", title="Number of requests in time in period from 1st December to end of 3rd December (interval 10 min)")
resampled_requests_plot.set_xlabel("Request timestamp")
resampled_requests_plot.set_ylabel("Number of requests")
resampled_requests_plot = requests.resample("30Min").count().plot(kind="line", title="Number of requests in time in period from 1st December to end of 3rd December (interval 30 min)")
resampled_requests_plot.set_xlabel("Request timestamp")
resampled_requests_plot.set_ylabel("Number of requests")
for i, day in enumerate(daylist):
reqs = pd.DataFrame(day["timestamp"]).reset_index().set_index("timestamp")
req_plt = reqs.resample("5Min").count().plot(kind="line", title="Number of requests on " + str(days[i]) + " (interval 5 min)")
req_plt.set_ylabel("Number of requests")
for i, day in enumerate(daylist):
reqs = pd.DataFrame(day["timestamp"]).reset_index().set_index("timestamp")
req_plt = reqs.resample("15Min").count().plot(kind="line", title="Number of requests on " + str(days[i]) + " (interval 15 min)")
req_plt.set_ylabel("Number of requests")
For each request we have pickup date and return date. Lets assume we have a request with pickup_date 27.11.2017 and return date 31.11.2017, then such request will add one value to each following day 27,28,29,30,31 (every day inbetween and including 27 and 31).
With this we can get some insight to which dates would have probably the largest number of active reservations
from collections import defaultdict
# For each day find most hot dates (which are most frequently in an interval). TODO: implement faster solution.
iteration_count = 0
day_counts = []
for day in daylist:
counts = defaultdict(int)
for row in day.itertuples():
for date in pd.date_range(row[2], row[3], normalize=True): # Normalize - to midnight.
counts[date] += 1
iteration_count += 1
day_counts.append(counts)
for i in range(3):
time_series_1 = pd.DataFrame(pd.Series(day_counts[i]))
time_series_1.index.name = "timestamp"
time_series_1.columns = ["count"]
df1=time_series_1.groupby(pd.Grouper(freq="D")).sum().plot(kind="line", title="Popularity by day for " + str(days[i]))
df1.set_ylabel("Number of quotes on a day")
df2 = time_series_1.groupby(pd.Grouper(freq="W")).sum().plot(kind="line", title="Popularity by week for " + str(days[i]))
df2.set_ylabel("Number of quotes on a day")
counts = defaultdict(int)
for row in df.itertuples():
for date in pd.date_range(row[2], row[3], normalize=True): # Normalize - to midnight.
counts[date] += 1
time_series_1 = pd.DataFrame(pd.Series(counts))
time_series_1.index.name = "timestamp"
time_series_1.columns = ["count"]
df1=time_series_1.groupby(pd.TimeGrouper("D")).sum().plot(kind="line", title="Hottest days by day (on request data)")
df1.set_ylabel("Number of reservations on a day")
df2 = time_series_1.groupby(pd.TimeGrouper("W")).sum().plot(kind="line", title="Hottest by week (on request data)")
df2.set_ylabel("Number of reservations on a day")
https://www.geeksforgeeks.org/find-the-point-where-maximum-intervals-overlap/
# sort pickup and returns
# TODO: Implement this, will be quick.
pickup_sort = df["pickup_timestamp"].dt.normalize().sort_values().reset_index(drop=True)
return_sort = df["return_timestamp"].dt.normalize().sort_values().reset_index(drop=True)
start = pickup_sort[0]
end = return_sort[len(df)-1]
pickup_ind = 0
return_ind = 0
opened_bracket_count = 0
counts_new = {}
length = len(df)
# For every date between first pickup_rate and last return_date find how many open reservations would be based on price requests.
# I wonder if date comparison is slow, it could probably be faster....
for date in pd.date_range(start, end, normalize=True):
while opened_bracket_count >= 1 and return_sort[return_ind] < date:
opened_bracket_count -= 1
return_ind += 1
while pickup_ind < length and pickup_sort[pickup_ind] <= date:
opened_bracket_count += 1
pickup_ind += 1
counts_new[date] = opened_bracket_count
def count_active_on_day(df):
# Algorithm to find how many requests are made for a certain day.
# Accepts a dataframe with pickup dates and return dates (columns pickup_timestamp and return_timestamp)
# Effective version based on this: https://www.geeksforgeeks.org/find-the-point-where-maximum-intervals-overlap/
# Returns a dictionary with key being date and values being how many requests for that date.
pickup_sort = df["pickup_timestamp"].dt.normalize().sort_values().reset_index(drop=True)
return_sort = df["return_timestamp"].dt.normalize().sort_values().reset_index(drop=True)
start = pickup_sort[0]
end = return_sort[len(df)-1]
pickup_ind = 0
return_ind = 0
opened_bracket_count = 0
counts_new = {}
length = len(df)
# For every date between first pickup_rate and last return_date find how many open reservations would be based on price requests.
# I wonder if date comparison is slow, it could probably be faster....
for date in pd.date_range(start, end, normalize=True):
while opened_bracket_count >= 1 and return_sort[return_ind] < date:
opened_bracket_count -= 1
return_ind += 1
while pickup_ind < length and pickup_sort[pickup_ind] <= date:
opened_bracket_count += 1
pickup_ind += 1
counts_new[date] = opened_bracket_count
return counts_new
# For each contractor find hot dates.
pickups_per_broker = merged_requests[["pickup_timestamp", "return_timestamp", "Broker_name"]]
broker_dict = {}
for group in pickups_per_broker.groupby(pickups_per_broker["Broker_name"]):
broker_dict[group[0]] = count_active_on_day(group[1])
for k,v in broker_dict.items():
time_series_1 = pd.DataFrame(pd.Series(v))
time_series_1.index.name = "timestamp"
time_series_1.columns = ["count"]
df1=time_series_1.groupby(pd.Grouper(freq="D")).sum().plot(kind="line", title="Hottest days for "+ k +" by day (on request data)")
df1.set_ylabel("Number of reservations on a day")
df2 = time_series_1.groupby(pd.Grouper(freq="W")).sum().plot(kind="line", title="Hottest days for " + k + " by week (on request data)")
df2.set_ylabel("Number of reservations on a day")
# Load in huge file by chunks, pick 3 columns and append to another file..
filename = "rate_quote.csv"
outputfile = "trimmed_rate_quote.csv"
fields = ["timestamp", "pickup_timestamp", "return_timestamp"]
chunksize = 300000
first_save = True
of = open(outputfile, "a")
for chunk in pd.read_csv(filename, chunksize=chunksize, skipinitialspace=True, usecols=fields):
chunk.to_csv(of)
of.close()
Currently not caring about duplicate, cancelled or not-finished reservations.
fields = ["rate_request_timestamp", "reservation_request_timestamp", "pickup_timestamp", "return_timestamp"]
df = pd.read_csv("reservations.csv", skipinitialspace=True, usecols=fields)
print(df.shape)
df["rate_request_timestamp"] = pd.to_datetime(df['rate_request_timestamp'])
df["reservation_request_timestamp"] = pd.to_datetime(df['reservation_request_timestamp'])
df["pickup_timestamp"] = pd.to_datetime(df['pickup_timestamp'])
df["return_timestamp"] = pd.to_datetime(df['return_timestamp'])
print(df.dtypes)
df_reservation_requests = df["reservation_request_timestamp"]
reservation_requests = df_reservation_requests.reset_index().set_index("reservation_request_timestamp")
resampled_requests_plot = reservation_requests.resample("1D").count().plot(kind="line", title="Number of reservation requests in time")
resampled_requests_plot.set_xlabel("Request timestamp")
resampled_requests_plot.set_ylabel("Number of requests")
resampled_requests_plot_week = reservation_requests.resample("1W").count().plot(kind="line", title="Number of reservation requests in time")
resampled_requests_plot_week.set_xlabel("Request timestamp")
resampled_requests_plot_week.set_ylabel("Number of requests")
df_pickup_times = df["pickup_timestamp"]
pickup_times = df_pickup_times.reset_index().set_index("pickup_timestamp")
resampled_requests_plot = pickup_times.resample("1D").count().plot(kind="line", title="Number of pickup times on a date (daily) in reservation data")
resampled_requests_plot.set_xlabel("Pickup timestamp")
resampled_requests_plot.set_ylabel("Number of requests")
resampled_requests_plot_week = pickup_times.resample("1W").count().plot(kind="line", title="Number of pickup times on a date (weekly) in reservation data")
resampled_requests_plot_week.set_xlabel("Pickup timestamp")
resampled_requests_plot_week.set_ylabel("Number of requests")
resampled_requests_plot_month = pickup_times.resample("1M").count().plot(kind="line", title="Number of pickup times on a date (monthly) in reservation data")
resampled_requests_plot_month.set_xlabel("Pickup timestamp")
resampled_requests_plot_month.set_ylabel("Number of requests")
The same as in price quote data. TODO: This also has to be replaced with more effective code.
# Find date most hot days (which are most frequently in an interval). TODO: implement faster solution.
iteration_count = 0
counts = defaultdict(int)
for row in df.itertuples():
for date in pd.date_range(row[1], row[2], normalize=True):
counts[date] += 1
#print(day_counts)
time_series_1 = pd.DataFrame(pd.Series(counts))
time_series_1.index.name = "timestamp"
time_series_1.columns = ["count"]
df1=time_series_1.groupby(pd.Grouper(freq="D")).sum().plot(kind="line", title="Hottest days by day")
df1.set_ylabel("Number of reservations on a day")
df2 = time_series_1.groupby(pd.Grouper(freq="W")).sum().plot(kind="line", title="Hottest by week")
df2.set_ylabel("Number of reservations on a day")
fields = ["rate_request_timestamp", "reservation_request_timestamp", "pickup_timestamp", "return_timestamp", "Broker name"]
df_res = pd.read_csv("reservations.csv", skipinitialspace=True, usecols=fields)
print(df_res.shape)
df_res["rate_request_timestamp"] = pd.to_datetime(df_res['rate_request_timestamp'])
df_res["reservation_request_timestamp"] = pd.to_datetime(df_res['reservation_request_timestamp'])
df_res["pickup_timestamp"] = pd.to_datetime(df_res['pickup_timestamp'])
df_res["return_timestamp"] = pd.to_datetime(df_res['return_timestamp'])
print(df_res.dtypes)
# For each contractor find hot dates.
broker_dict_res = {}
for group in df_res.groupby(df_res["Broker name"]):
broker_dict_res[group[0]] = count_active_on_day(group[1])
for k,v in broker_dict_res.items():
time_series_1 = pd.DataFrame(pd.Series(v))
time_series_1.index.name = "timestamp"
time_series_1.columns = ["count"]
df1=time_series_1.groupby(pd.Grouper(freq="D")).sum().plot(kind="line", title="Hottest days for "+ k +" by day (on reservation data)")
df1.set_ylabel("Number of reservations on a day")
df2 = time_series_1.groupby(pd.Grouper(freq="W")).sum().plot(kind="line", title="Hottest days for " + k + " by week (on reservation data)")
df2.set_ylabel("Number of reservations on a day")
This data also has broker name
fields = ["timestamp", "pickup_timestamp", "return_timestamp", "Broker_name"]
df_dups = pd.read_csv("rate_quote_1_dup1.csv", skipinitialspace=True, usecols=fields)
print(df_dups.shape)
df_dups["timestamp"] = pd.to_datetime(df_dups['timestamp'])
df_dups["pickup_timestamp"] = pd.to_datetime(df_dups['pickup_timestamp'])
df_dups["return_timestamp"] = pd.to_datetime(df_dups['return_timestamp'])
print(df_dups.dtypes)
# For each contractor see how many requests per day..
contractors_dups = []
for group in df_dups.groupby(df_dups["Broker_name"]):
time_contract = pd.DataFrame(group[1]["timestamp"])
contractors_dups.append((group[0], time_contract.groupby(time_contract["timestamp"].dt.date).count()))
l_dups = []
for con in contractors_dups:
for row in con[1].itertuples():
l_dups.append([con[0], row[0], row[1]])
df_con_dups = pd.DataFrame(l_dups)
df_con_dups.columns = ["contractor", "date", "count"]
import seaborn as sns
import matplotlib.pyplot as plt
plt.ylim(10, 40)
sns.barplot(x='contractor', y='count', hue='date', data=df_con_dups, log=True)
plt.ylim(1, 10e6)
# For each contractor find hot dates.
broker_dict_dups = {}
for group in df_dups.groupby(df_dups["Broker_name"]):
broker_dict_dups[group[0]] = count_active_on_day(group[1])
for k,v in broker_dict_dups.items():
time_series_1 = pd.DataFrame(pd.Series(v))
time_series_1.index.name = "timestamp"
time_series_1.columns = ["count"]
df1=time_series_1.groupby(pd.Grouper(freq="D")).sum().plot(kind="line", title="Hottest days for "+ k +" by day (on w/o dups request data)")
df1.set_ylabel("Number of reservations on a day")
df2 = time_series_1.groupby(pd.Grouper(freq="W")).sum().plot(kind="line", title="Hottest days for " + k + " by week (on w/o dups request data)")
df2.set_ylabel("Number of reservations on a day")